#!/bin/sh
# Job yaml can send below ENVIRONMENT VARIABLES to me
# - timeout
# - interval
# END ENVIRONMENT VARIABLES

# Load the job helpers first, then import LKP_SERVER (it is pinged by
# nfs_hang() to tell a network outage from an NFS hang).
# NOTE(review): load_job_vars presumably exports the named job variable
# into this shell -- confirm against lib/job.sh.
. "$LKP_SRC/lib/job.sh"
load_job_vars LKP_SERVER

# Remaining helper libraries. The paths are quoted so that an LKP_SRC
# containing whitespace or glob characters still resolves correctly.
. "$LKP_SRC/lib/common.sh"
. "$LKP_SRC/lib/wait.sh"
. "$LKP_SRC/lib/http.sh"
. "$LKP_SRC/lib/job-init.sh"
. "$LKP_SRC/lib/sysinfo.sh"

# Defaults, applied only when the job did not provide a value:
#   timeout  - seconds a single df probe may take before it counts as hung
#   interval - argument handed to wait_timeout between two hang checks
#              (presumably seconds -- confirm against lib/wait.sh)
: "${timeout:=200}"
: "${interval:=900}"

setup_wait

# Reboot the machine through the kernel's SysRq trigger file, bypassing
# the normal (possibly blocked) shutdown path.
# Writes 's' first, pauses five seconds, then writes 'b'; per the kernel
# SysRq documentation these request an emergency sync and an immediate
# reboot respectively.
force_reboot()
{
	local sysrq=/proc/sysrq-trigger

	printf 's\n' > "$sysrq"
	# Leave the sync request some time to finish before pulling the plug.
	sleep 5
	printf 'b\n' > "$sysrq"
}

# Decide whether the machine appears to be stuck on a hung NFS mount.
#
# Globals read:   nr_cpu, timeout, LKP_SERVER, cpu_utilization
# Helpers called: calc_cpu_utilization, has_cmd (defined outside this file)
# Outputs:        one diagnostic line on stdout when a hang is detected
# Returns:        0 when a hang is detected, 1 otherwise
nfs_hang()
{
	local loadavg
	local misc
	local i

	read -r loadavg misc < /proc/loadavg

	# If loadavg is high but CPU utilization (without counting iowait
	# time) is low, many jobs are waiting for I/O (state D), which may be
	# due to an NFS hang. If the CPUs are actually busy, the load is
	# genuine and this is not treated as a hang.
	if [ "${loadavg%.*}" -ge $((3 * nr_cpu + 10)) ]; then
		calc_cpu_utilization
		[ "$cpu_utilization" -ge 5 ] && return 1
	fi

	# df has to get stuck on three consecutive attempts; a single attempt
	# that completes in time clears the suspicion.
	for i in 1 2 3; do
		if has_cmd timeout; then
			timeout "$timeout" df > /dev/null
			# timeout(1) exits with 124 when the command timed out.
			[ "$?" -eq 124 ] || return 1
		else
			# No timeout(1): run df in the background and check
			# whether it is still alive after $timeout seconds.
			(df > /dev/null) &
			sleep "$timeout"
			test -d "/proc/$!" || return 1
		fi
	done

	# If ping failed -- possibly due to physical cable loose, network
	# switch crashed, LKP server down, etc. -- it probably won't help
	# to reboot (and keep rebooting).
	ping -c 5 -q "$LKP_SERVER" > /dev/null || return 1

	# cpu_utilization is a variable (see the -ge test above), not a
	# command. It is only refreshed in this call when the load average
	# crossed the threshold, so it may be unset here.
	echo "nfs hang, loadavg: $loadavg, cpu utilization: ${cpu_utilization:-unknown}%"
	return 0
}

# Endless monitoring loop: after every wait_timeout period, run the
# nfs_hang check; on a positive result record the "nfs_hang" job state,
# log a message to stderr and force a reboot. Never returns on its own.
nfs_hang_watchdog()
{
	while true; do
		wait_timeout $interval

		# Nothing suspicious this round -- go back to waiting.
		nfs_hang || continue

		set_job_state "nfs_hang"
		echo "NFS hang! Force reboot!" >&2
		force_reboot
	done
}

# Start monitoring; nfs_hang_watchdog loops forever and does not return.
nfs_hang_watchdog
